Anastasiia Kornilova
January 14, 2016
Data from San Francisco Open Data https://data.sfgov.org/
# Load the SFPD incident export (path is relative to the working directory),
# keep only the rows that have a value in every column, and preview the result.
crimes <- read.csv(file = "SFPD_Incidents_-_from_1_January_2003.csv")
crimes <- crimes[complete.cases(crimes), ]
head(crimes, n = 6L)
## IncidntNum Category Descript
## 1 151118570 OTHER OFFENSES DRIVERS LICENSE, SUSPENDED OR REVOKED
## 2 156312404 NON-CRIMINAL LOST PROPERTY
## 3 151119283 BURGLARY BURGLARY, FORCIBLE ENTRY
## 4 151119443 LARCENY/THEFT PETTY THEFT BICYCLE
## 5 151118514 OTHER OFFENSES DRIVERS LICENSE, SUSPENDED OR REVOKED
## 6 151118564 SUSPICIOUS OCC INVESTIGATIVE DETENTION
## DayOfWeek Date Time PdDistrict Resolution
## 1 Monday 12/28/2015 23:58 MISSION ARREST, BOOKED
## 2 Monday 12/28/2015 23:45 TENDERLOIN NONE
## 3 Monday 12/28/2015 23:30 TARAVAL NONE
## 4 Monday 12/28/2015 23:30 TARAVAL NONE
## 5 Monday 12/28/2015 23:25 SOUTHERN ARREST, BOOKED
## 6 Monday 12/28/2015 23:15 INGLESIDE NONE
## Address X Y
## 1 TREAT AV / 22ND ST -122.4133 37.75576
## 2 500 Block of OFARRELL ST -122.4138 37.78574
## 3 4000 Block of 19TH AV -122.4711 37.71569
## 4 200 Block of MONTICELLO ST -122.4707 37.71878
## 5 HOWARD ST / 6TH ST -122.4072 37.77974
## 6 100 Block of OCEAN AV -122.4393 37.72345
## Location PdId
## 1 (37.7557647906343, -122.413297660808) 1.511186e+13
## 2 (37.7857382950322, -122.413820070176) 1.563124e+13
## 3 (37.7156903218239, -122.471071293807) 1.511193e+13
## 4 (37.7187815894447, -122.470673521408) 1.511194e+13
## 5 (37.7797376338991, -122.407163046954) 1.511185e+13
## 6 (37.7234530529634, -122.439338398681) 1.511186e+13
# Derive calendar features from the raw Date / Time columns.
# Date holds month/day/year text (e.g. 12/28/2015), hence mdy().
crimes[["Dates"]] <- mdy(crimes[["Date"]])
# Time is "HH:MM"; its first two characters are the hour, kept as a factor.
crimes[["hour"]] <- as.factor(substr(crimes[["Time"]], start = 1, stop = 2))
crimes[["year"]] <- year(crimes[["Dates"]])
crimes[["day"]] <- day(crimes[["Dates"]])
crimes[["month"]] <- month(crimes[["Dates"]])
# Pin the weekday levels so that tables are ordered Monday through Sunday.
crimes[["DayOfWeek"]] <- factor(
  crimes[["DayOfWeek"]],
  levels = c(
    "Monday", "Tuesday", "Wednesday", "Thursday",
    "Friday", "Saturday", "Sunday"
  )
)
##
## 2013 2014 2015
## 35665 150162 151799
FALSE Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=sanfrancisco&zoom=13&size=640x640&scale=2&maptype=terrain&sensor=false
FALSE Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=sanfrancisco&sensor=false
## Source: local data frame [6 x 4]
## Groups: PdDistrict, Address, X [6]
##
## X PdDistrict Address count_s
## (dbl) (fctr) (fctr) (int)
## 1 -122.3873 BAYVIEW 22ND ST / ILLINOIS ST 116
## 2 -122.3868 BAYVIEW 800 Block of CHINABASIN ST 121
## 3 -122.4108 CENTRAL 1600 Block of THE EMBARCADERONORTH ST 210
## 4 -122.4060 CENTRAL 200 Block of POST ST 123
## 5 -122.4128 CENTRAL 300 Block of BAY ST 131
## 6 -122.4076 CENTRAL 300 Block of POST ST 107
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=sanfrancisco&zoom=14&size=640x640&scale=2&maptype=terrain&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=sanfrancisco&sensor=false
Define a new category, potential_threat, which marks incidents that are a potential threat to tourists.
potential_threat is "yes" for rows whose Category is one of "LARCENY/THEFT", "VEHICLE THEFT", "STOLEN PROPERTY", "ROBBERY", "ASSAULT" or "EXTORTION", and "no" otherwise.
Split the data into two parts: 80% for training and 20% for testing.
# Fit a naive Bayes classifier on the training set: the yes/no threat label is
# predicted from police district, weekday and hour of day, with Laplace
# smoothing of 1 applied to the conditional probability tables.
# NOTE(review): the printed PdDistrict table has one more value column than
# named districts, which suggests a blank district level in the data - confirm.
m1 <- naiveBayes(
  as.factor(potential_threat) ~ PdDistrict + DayOfWeek + hour,
  data = train,
  laplace = 1
)
print(m1)
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## no yes
## 0.5767197 0.4232803
##
## Conditional probabilities:
## PdDistrict
## Y BAYVIEW CENTRAL INGLESIDE MISSION
## no 6.419186e-06 1.036313e-01 1.041128e-01 8.722389e-02 1.412670e-01
## yes 8.745922e-06 7.855587e-02 1.334453e-01 8.148576e-02 1.041989e-01
## PdDistrict
## Y NORTHERN PARK RICHMOND SOUTHERN TARAVAL
## no 1.076818e-01 6.528312e-02 5.478775e-02 1.740305e-01 7.735119e-02
## yes 1.487856e-01 5.492439e-02 5.667358e-02 2.165578e-01 6.599673e-02
## PdDistrict
## Y TENDERLOIN
## no 8.462412e-02
## yes 5.936732e-02
##
## DayOfWeek
## Y Monday Tuesday Wednesday Thursday Friday Saturday
## no 0.1421822 0.1430937 0.1464639 0.1422849 0.1533390 0.1426315
## yes 0.1359601 0.1350942 0.1379105 0.1369397 0.1544147 0.1577820
## DayOfWeek
## Y Sunday
## no 0.1300047
## yes 0.1418988
##
## hour
## Y 00 01 02 03 04
## no 0.054404478 0.028364014 0.024756733 0.018164780 0.013260931
## yes 0.041127396 0.030598503 0.023864908 0.014726459 0.009374563
## hour
## Y 05 06 07 08 09
## no 0.011707618 0.016444581 0.029545046 0.039481116 0.044108963
## yes 0.009663145 0.012321603 0.017332447 0.026033650 0.032163845
## hour
## Y 10 11 12 13 14
## no 0.043402911 0.044904876 0.064680736 0.050238774 0.049500629
## yes 0.039238492 0.041879460 0.051017910 0.044948930 0.046837834
## hour
## Y 15 16 17 18 19
## no 0.054597037 0.057665152 0.061227503 0.059122185 0.051548178
## yes 0.051175318 0.055775150 0.064039107 0.076850427 0.073754722
## hour
## Y 20 21 22 23
## no 0.046220699 0.044962643 0.047440242 0.044250173
## yes 0.069574647 0.059963971 0.057777739 0.049959773
Apply the trained model to the test data.
##
## pred_nb no yes
## no 32886 21668
## yes 5849 7123
# Confusion matrix: predicted class in rows, actual test label in columns.
table(pred_nb, test[["potential_threat"]])
##
## pred_nb no yes
## no 32886 21668
## yes 5849 7123